In [1]:
import pandas as pd
import numpy as np
import scipy as sy
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib.units as munits
import matplotlib as mpl
from scipy.special import ndtri
import matplotlib.dates as mdates
import plotly.graph_objects as go
import plotly.express as px


import warnings
import scipy.stats as st
import statsmodels.api as sm
import matplotlib
In [2]:
#creating a dataframe with timestamp
t_index = pd.DatetimeIndex(pd.date_range(start='2019-10-10', end='2020-06-06 23:00:00', freq="1h"))
df = pd.DataFrame(t_index, columns=['hour'])
df
Out[2]:
hour
0 2019-10-10 00:00:00
1 2019-10-10 01:00:00
2 2019-10-10 02:00:00
3 2019-10-10 03:00:00
4 2019-10-10 04:00:00
... ...
5779 2020-06-06 19:00:00
5780 2020-06-06 20:00:00
5781 2020-06-06 21:00:00
5782 2020-06-06 22:00:00
5783 2020-06-06 23:00:00

5784 rows × 1 columns

In [4]:
# Import data using datetime and no data value
hourly = pd.read_csv('Book1.csv')
hourly
Out[4]:
Biogas_yield Methane hour
0 24.369362 57.197067 10/10/2019 0:00
1 23.942646 57.182819 10/10/2019 1:00
2 24.668522 57.191166 10/10/2019 2:00
3 23.575047 57.181984 10/10/2019 3:00
4 22.973997 57.180664 10/10/2019 4:00
... ... ... ...
5779 13.000000 NaN 6/06/2020 19:00
5780 14.000000 NaN 6/06/2020 20:00
5781 14.000000 NaN 6/06/2020 21:00
5782 13.000000 NaN 6/06/2020 22:00
5783 13.000000 NaN 6/06/2020 23:00

5784 rows × 3 columns

In [11]:
fig = px.line(hourly, x=hourly.hour, y=hourly.Biogas_yield, title="Daily biogas production")
fig.show()
In [5]:
extracted_col = hourly["Biogas_yield"]
extracted_col
Out[5]:
0       24.369362
1       23.942646
2       24.668522
3       23.575047
4       22.973997
          ...    
5779    13.000000
5780    14.000000
5781    14.000000
5782    13.000000
5783    13.000000
Name: Biogas_yield, Length: 5784, dtype: float64
In [6]:
df1 = df.join(extracted_col)
df1
Out[6]:
hour Biogas_yield
0 2019-10-10 00:00:00 24.369362
1 2019-10-10 01:00:00 23.942646
2 2019-10-10 02:00:00 24.668522
3 2019-10-10 03:00:00 23.575047
4 2019-10-10 04:00:00 22.973997
... ... ...
5779 2020-06-06 19:00:00 13.000000
5780 2020-06-06 20:00:00 14.000000
5781 2020-06-06 21:00:00 14.000000
5782 2020-06-06 22:00:00 13.000000
5783 2020-06-06 23:00:00 13.000000

5784 rows × 2 columns

In [7]:
df1['hour'] = pd.to_datetime(df['hour'])
daily = df1.set_index('hour')
In [10]:
daily = df1.resample('D', on='hour').sum()
daily.columns
Out[10]:
Index(['Biogas_yield'], dtype='object')
In [12]:
#Batch2 input: 
#292.6 tonnes of RG (DM: 34.1 ± 8.4%)
#30.64 tonnes of inoculum (Biogas potential of inoculum is 30 m3/ton)
#Approximately 162 tonnes of residual grass from Batch1 (Biogas potential: 64 m3/ton)

daily
daily['Biogas_yield_FU'] = (daily['Biogas_yield'] / 485.24).round(2)
daily.to_csv('daily.csv', sep=',')
daily.describe()
Out[12]:
Biogas_yield Biogas_yield_FU
count 241.000000 241.000000
mean 197.227960 0.406266
std 151.319908 0.311681
min 0.000000 0.000000
25% 0.000000 0.000000
50% 304.000000 0.630000
75% 316.000000 0.650000
max 339.000000 0.700000
In [9]:
df_latest = pd.read_csv('daily.csv', sep=',')
df_latest.describe()
Out[9]:
Biogas_yield Biogas_yield_FU
count 241.000000 241.000000
mean 197.227960 0.610415
std 151.319908 0.468295
min 0.000000 0.000000
25% 0.000000 0.000000
50% 304.000000 0.940000
75% 316.000000 0.980000
max 339.000000 1.050000
In [10]:
fig = px.line(df_latest, x=df_latest.hour, y=df_latest.Biogas_yield, title="Daily biogas production")
fig.show()
In [19]:
#Batch2 input: 292.6 tonnes of RG (DM: 34.1 ± 8.4%)
#30.64 tonnes of inoculum

df_cumulative = daily['Biogas_yield'].cumsum()
df_updated = pd.DataFrame(df_cumulative, columns=['Biogas_yield'])
df_updated
df_updated['Biogas_yield_FU'] = (df_updated['Biogas_yield'] / 323.24).round(2)
df_updated.describe()
Out[19]:
Biogas_yield Biogas_yield_FU
count 241.000000 241.000000
mean 15259.818143 47.210249
std 15774.461101 48.799595
min 334.938475 1.040000
25% 334.938475 1.040000
50% 10151.938475 31.410000
75% 28806.938475 89.120000
max 47531.938475 147.050000
In [21]:
df_updated
df_updated.describe()
Out[21]:
Biogas_yield Biogas_yield_FU
count 241.000000 241.000000
mean 15259.818143 47.210249
std 15774.461101 48.799595
min 334.938475 1.040000
25% 334.938475 1.040000
50% 10151.938475 31.410000
75% 28806.938475 89.120000
max 47531.938475 147.050000
In [22]:
time_series = pd.read_csv('daily.csv')
time_series.describe()
Out[22]:
Biogas_yield Biogas_yield_FU
count 241.000000 241.000000
mean 197.227960 0.610415
std 151.319908 0.468295
min 0.000000 0.000000
25% 0.000000 0.000000
50% 304.000000 0.940000
75% 316.000000 0.980000
max 339.000000 1.050000
In [13]:
Total = time_series["Biogas_yield_FU"]
Total_kde = Total.plot.kde(figsize=(20,10)) 
pd.DataFrame(Total).describe()
Out[13]:
Biogas_yield_FU
count 241.000000
mean 0.610415
std 0.468295
min 0.000000
25% 0.000000
50% 0.940000
75% 0.980000
max 1.050000
In [14]:
time_series.columns
Out[14]:
Index(['hour', 'Biogas_yield', 'Biogas_yield_FU'], dtype='object')
In [15]:
#fig = px.line(daily, x=time_series.hour, y=time_series.Biogas_yield_FU, title='Daily Biogas production')
fig = px.line(time_series, x=time_series.hour, y=time_series.Biogas_yield, title='Daily Biogas production')
fig.update_layout(xaxis_title="Date", yaxis_title="Daily biogas yield (m3/day)", width=1000)
fig.show()
In [16]:
data = time_series.drop(['hour', 'Biogas_yield_FU'], axis=1)
data
Out[16]:
Biogas_yield
0 334.938475
1 0.000000
2 0.000000
3 0.000000
4 0.000000
... ...
236 317.000000
237 316.000000
238 323.000000
239 301.000000
240 307.000000

241 rows × 1 columns

In [17]:
# Create models from data
def best_fit_distribution(data, bins=200, ax=None):
    """Model data by finding best fit distribution to data"""
    # Get histogram of original data
    y, x = np.histogram(data, bins=bins, density=True)
    x = (x + np.roll(x, -1))[:-1] / 2.0

    # Distributions to check
    DISTRIBUTIONS = [        
        st.dgamma,st.dweibull,st.expon,st.exponnorm,st.gamma,st.lognorm,st.norm,st.t,st.triang,
        st.uniform
    ]

    # Best holders
    best_distribution = st.norm
    best_params = (0.0, 1.0)
    best_sse = np.inf

    # Estimate distribution parameters from data
    for distribution in DISTRIBUTIONS:

        # Try to fit the distribution
        try:
            # Ignore warnings from data that can't be fit
            with warnings.catch_warnings():
                warnings.filterwarnings('ignore')

                # fit dist to data
                params = distribution.fit(data)

                # Separate parts of parameters
                arg = params[:-2]
                loc = params[-2]
                scale = params[-1]

                # Calculate fitted PDF and error with fit in distribution
                pdf = distribution.pdf(x, loc=loc, scale=scale, *arg)
                sse = np.sum(np.power(y - pdf, 2.0))

                # if axis pass in add to plot
                try:
                    if ax:
                        pd.Series(pdf, x).plot(ax=ax)
                    end
                except Exception:
                    pass

                # identify if this distribution is better
                if best_sse > sse > 0:
                    best_distribution = distribution
                    best_params = params
                    best_sse = sse

        except Exception:
            pass

    return (best_distribution.name, best_params)

def make_pdf(dist, params, size=10000):
    """Generate distributions's Probability Distribution Function """

    # Separate parts of parameters
    arg = params[:-2]
    loc = params[-2]
    scale = params[-1]

    # Get sane start and end points of distribution
    start = dist.ppf(0.01, *arg, loc=loc, scale=scale) if arg else dist.ppf(0.01, loc=loc, scale=scale)
    end = dist.ppf(0.99, *arg, loc=loc, scale=scale) if arg else dist.ppf(0.99, loc=loc, scale=scale)

    # Build PDF and turn into pandas Series
    x = np.linspace(start, end, size)
    y = dist.pdf(x, loc=loc, scale=scale, *arg)
    pdf = pd.Series(y, x)

    return pdf
In [18]:
# Plot for comparison
plt.figure(figsize=(20,15))
ax = data.plot(kind='hist', bins=200, density=True, alpha=0.5, color = [color['color'] for color in list(mpl.rcParams['axes.prop_cycle'])])
# Save plot limits
dataYLim = ax.get_ylim()

# Find best fit distribution
best_fit_name, best_fit_params = best_fit_distribution(data, 200, ax)
best_dist = getattr(st, best_fit_name)

# Update plots
ax.set_ylim(dataYLim)
ax.set_title(u'Biogas yield.\n Vanheede ')
ax.set_xlabel(u'Biogas yield (m3/day)')
ax.set_ylabel('Frequency')

# Make PDF with best params 
pdf = make_pdf(best_dist, best_fit_params)

# Display
plt.figure(figsize=(20,15))
ax = pdf.plot(lw=2, label='PDF', legend=True)
data.plot(kind='hist', bins=50, density=True, alpha=0.5, label='Data', legend=True, ax=ax)

param_names = (best_dist.shapes + ', loc, scale').split(', ') if best_dist.shapes else ['loc', 'scale']
param_str = ', '.join(['{}={:0.2f}'.format(k,v) for k,v in zip(param_names, best_fit_params)])
dist_str = '{}({})'.format(best_fit_name, param_str)

ax.set_title(u'Biogas yield. with best fit distribution \n' + dist_str)
ax.set_xlabel(u'Biogas yield (m3/day)')
ax.set_ylabel('Frequency')
Out[18]:
Text(0, 0.5, 'Frequency')
<Figure size 1440x1080 with 0 Axes>

image.png

In [ ]: